home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Aminet 1 (Walnut Creek)
/
Aminet - June 1993 [Walnut Creek].iso
/
usenet
/
sources
/
volume91
/
libraris
/
sregexp1
/
part02
/
sregexp.c
< prev
next >
Wrap
C/C++ Source or Header
|
1992-04-20
|
17KB
|
663 lines
/* This is a routine to test if a string matches a regular expression.
*
*
* The regular expression sytax is a slight extension of the regular
* AmigaDos wildcard patterns. Apparently 2.0 has added a few new things
* (well, at least a not operator) to the game. This may or may not
* be consistant with those additions (it's the way they otta done it.)
*
* Here it is:
*
* ? matches any one character
*
* #(pat) matches zero or more occurances of the pattern pat.
*
* (pat) seperates out a piece of the pattern.
*
* pat1|pat2 matches to either pat1 or pat2
*
* ' escapes the special meaning of these symbols.
*
* % matches the null string.
*
* EXTENSIONS
*
* [chars] will match any single character in the set chars,
* specified as single characters or ranges seperated
* by a -. Ex. [af-i+] would match a, f, g, h, i, and
* +. If ~ is the first character in the set then the
* set matched is the complement of the set specified.
* Ex. [~a-z] would match any one character that is
* not a (lower case if case sensitivity is on) letter.
* Note that a [~a] is NOT the same as a ~[a]. The former
* would match a 'b' but not a 'ab', whereas the latter
* would match both.
*
* ~(pat) will match anything that doesn't match the pattern.
* Note: it is illegal to repeat a not. ie #~a is illegal.
* (So is #(~(a)) so don't even try it.) You can repeat
* something with a not IN it, as long as it can't be
* reduced to the form #~(pattern). (A #[~a] is OK.)
* However ~#a has the expected effect (matches any
* non-null string not composed entirely of a's.)
*
* * same as #?.
*
*
*/
#include "sregexp.h"
const char wilds[] = { /* string of the wildcards, for easy access */
CHR_REPEAT,
CHR_NOT,
CHR_OPENBRACE,
CHR_CLOSEBRACE,
CHR_OPENSET,
CHR_CLOSESET,
CHR_ANYCHAR,
CHR_NULL,
CHR_OR,
CHR_ESCAPE,
/* CHR_RANGE, <--- special case */
CHR_STAR,
0
};
#ifdef __DEBUG__
extern void puts(char *);
extern void printf(char *, ...);
#endif
struct SregExp *
parsesregexp(expr)
char *expr;
/* This is the entry point into the parsing subroutines. */
{
return parsesub(&expr,0);
}
static struct SregExp *
parsesub(p,end)
char **p,end;
/* This routine will parse a sub expression up until the character
'end' is encontered. This is 0 for the whole pattern and ')'
for a subpattern */
{
struct SregList *list,*head=NULL,*orlist,*orhead=NULL;
struct SregExp *sreg;
int num = 0,ornum = 0;
while (**p != end) {
if (**p == CHR_OR) { /* deal with stuff like (ab||cd) == (ab|%|cd) */
if (!(sreg = makenull()))
goto cleanup;
} else {
if (!(sreg = parseone(p,TRUE))) /* parse one wildcard */
goto cleanup;
}
if (head) { /* if there's already a list, just stick it on */
if (!(list = (list->srl_next = getmem(sizeof(struct SregList))))) {
report(MEM_ERROR);
goto cleanup;
}
} else { /* otherwise, make a new list */
if (!(list = (head = getmem(sizeof(struct SregList))))) {
report(MEM_ERROR);
goto cleanup;
}
}
list->srl_sreg = sreg;
list->srl_next = NULL;
num++;
if (**p == CHR_OR) { /* deal with an or */
if (!(sreg = makesum(head,num)))
goto cleanup;
if (orhead) { /* either add onto the or list */
if (!(orlist = (orlist->srl_next = getmem(sizeof(struct SregList))))) {
report(MEM_ERROR);
goto cleanup;
}
} else { /* or make a new or list */
if (!(orlist = (orhead = getmem(sizeof(struct SregList))))) {
report(MEM_ERROR);
goto cleanup;
}
}
orlist->srl_sreg = sreg;
orlist->srl_next = NULL;
ornum++;
(*p)++;
head = NULL;
num = 0;
}
}
if (head) {
if (!(sreg = makesum(head,num)))
goto cleanup;
} else {
if (!(sreg = makenull()))
goto cleanup;
}
head = NULL;
if (orhead) { /* if this expression had an or, clean it up */
if (!(orlist = (orlist->srl_next = getmem(sizeof(struct SregList))))) {
report(MEM_ERROR);
freesregexp(sreg);
goto cleanup;
}
orlist->srl_sreg = sreg;
orlist->srl_next = NULL;
ornum++;
if (!(sreg = makeor(orhead,ornum)))
goto cleanup;
}
return sreg; /* sub expression successfully matched. */
cleanup: /* troubles all head here to clean up */
list = head;
while (list) { /* free all the bits of the current sum */
head = list->srl_next;
freesregexp(list->srl_sreg);
freemem(list,sizeof(struct SregList));
list = head;
}
list = orhead;
while (list) { /* and all of the current or parts */
head = list->srl_next;
freesregexp(list->srl_sreg);
freemem(list,sizeof(struct SregList));
list = head;
}
return NULL; /* return the failure */
}
static struct SregExp *
makesum(list,num)
struct SregList *list;
int num;
/* This routine takes a linked list of sreg's and joins them together
Under the auspices of one big SRP_SUM type sreg */
{
struct SregList *lnk;
struct SregExp *sreg;
int i,len=0;
char fixed = SRF_FIXLEN;
if (num == 0) {
report(ILLEGAL_ERR);
return NULL;
} else if (num == 1) {
sreg = list->srl_sreg;
freemem(list,sizeof(struct SregList));
return sreg;
} if (!(sreg = getmem(sizeof(struct SregExp) + num*sizeof(struct SregExp *)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_SUM;
sreg->sre_Flag = 0;
sreg->sre_Data.number = num;
for (i = 0; i < num; i++) {
len += realen(list->srl_sreg);
fixed &= isfixed(list->srl_sreg) ? SRF_FIXLEN : 0;
sreg->sre_List[i] = list->srl_sreg;
lnk = list->srl_next;
freemem(list,sizeof(struct SregList));
list = lnk;
}
sreg->sre_MinLen = len;
sreg->sre_Flag |= fixed;
return sreg;
}
static struct SregExp *
makeor(list,num)
struct SregList *list;
int num;
/* This does basically the same thing as makesum, except it makes
an SRP_OR type sreg to join the list and doesn't deal with some
special cases */
{
struct SregList *lnk;
struct SregExp *sreg;
int i,len;
char fixed = SRF_FIXLEN;
if (!(sreg = getmem(sizeof(struct SregExp) + num*sizeof(struct SregExp *)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_OR;
sreg->sre_Flag = 0;
sreg->sre_Data.number = num;
for (i = 0; i < num; i++) {
if (i == 0)
len = realen(list->srl_sreg);
else if (len != realen(list->srl_sreg)) {
fixed = 0;
if (len > realen(list->srl_sreg))
len = realen(list->srl_sreg);
}
fixed &= isfixed(list->srl_sreg) ? SRF_FIXLEN : 0;
sreg->sre_List[i] = list->srl_sreg;
lnk = list->srl_next;
freemem(list,sizeof(struct SregList));
list = lnk;
}
sreg->sre_MinLen = len;
sreg->sre_Flag |= fixed;
return sreg;
}
static struct SregExp *
parseone(p,cat)
char **p,cat;
/* This routine parses one wildcard character from the string *p,
leaving it set up pointing to the begining of the next
wildcard. If 'cat' is true then a string of characters will
be grouped together as a SRP_STRING type sreg. It may be
false because of the scope rules of ~ and #, which only
repeat the very next pattern. */
{
struct SregExp *sreg;
switch (**p) {
case (CHR_REPEAT) :
(*p)++;
if (!(sreg = parseone(p,FALSE)))
return NULL;
if (sreg->sre_Flag & SRF_NOT) {
report(ILLEGAL_ERR);
freesregexp(sreg);
return NULL;
}
sreg->sre_Flag |= SRF_REPEAT;
break;
case (CHR_NOT) :
(*p)++;
if (!(sreg = parseone(p,FALSE)))
return NULL;
sreg->sre_Flag |= SRF_NOT;
break;
case (CHR_OPENBRACE) :
(*p)++;
sreg = parsesub(p,CHR_CLOSEBRACE);
(*p)++;
break;
case (CHR_OPENSET) :
(*p)++;
if (!(sreg = getmem(sizeof(struct SregExp)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_SETCHAR;
sreg->sre_Flag = SRF_FIXLEN;
sreg->sre_MinLen = 1;
if (!(sreg->sre_Data.setchar = makeset(p))) {
freemem(sreg,sizeof(sreg));
return NULL;
}
(*p)++;
break;
case (CHR_ANYCHAR) :
(*p)++;
if (!(sreg = getmem(sizeof(struct SregExp)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_ANYCHAR;
sreg->sre_Flag = SRF_FIXLEN;
sreg->sre_MinLen = 1;
break;
case (CHR_STAR) :
(*p)++;
if (!(sreg = getmem(sizeof(struct SregExp)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_ANYCHAR;
sreg->sre_Flag = SRF_FIXLEN | SRF_REPEAT;
sreg->sre_MinLen = 1;
break;
case (CHR_NULL) :
(*p)++;
if (!(sreg = makenull()))
return NULL;
break;
case (CHR_CLOSEBRACE) :
case (CHR_CLOSESET) :
case (CHR_OR) :
case (0) :
report(ILLEGAL_ERR);
return NULL;
default :
if (!(sreg = getmem(sizeof(struct SregExp)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Flag = SRF_FIXLEN;
if (!cat) {
sreg->sre_Data.onechar = onechar(p,FALSE);
sreg->sre_Type = SRP_ONECHAR;
sreg->sre_MinLen = 1;
} else {
char *q=*p;
int len=0;
while (onechar(&q,FALSE))
len++;
sreg->sre_MinLen = len;
if (len == 1) {
sreg->sre_Data.onechar = onechar(p,FALSE);
sreg->sre_Type = SRP_ONECHAR;
} else {
sreg->sre_Type = SRP_STRING;
if (!(q = sreg->sre_Data.string = getmem(len+1))) {
report(MEM_ERROR);
freemem(sreg,sizeof(sreg));
return NULL;
}
while (*q++ = onechar(p,FALSE)) ;
}
}
}
return sreg;
}
static struct SregExp *
makenull()
/* This routine makes an node matching the null string. */
{
struct SregExp *sreg;
if (!(sreg = getmem(sizeof(struct SregExp)))) {
report(MEM_ERROR);
return NULL;
}
sreg->sre_Type = SRP_NULL;
sreg->sre_Flag = SRF_FIXLEN;
sreg->sre_MinLen = 0;
}
static char
onechar(p,minus)
char **p,minus;
/* this routine picks one character off the string *p. It handles
escaping the special meaning of the wildcard characters, and
returns 0 if we're at the end of the string or the next
char is a wildcard. if 'minus' is true, then the '-' is
treated as a wildcard, otherwise it isn't */
{
if (**p == CHR_ESCAPE)
(*p)++;
else if (strchr(wilds,**p) || (minus && **p == CHR_RANGE))
return 0;
return *(*p)++;
}
static char *
makeset(p)
char **p;
/* this makes a table of match flags from the character specifier
that goes in between [ and ]. It handles a leading '~' and
leaves *p pointing to the closing brace. Note: This routine
checks to make sure the specifier ends in a ] and fails if
it doesn't */
{
char *set,not=FALSE,ok=FALSE,s,e;
if (**p == CHR_NOT) {
(*p)++;
not = TRUE;
}
if (!(set = getmem(32))) {
report(MEM_ERROR);
return NULL;
}
for (s = 0; s < 32; s++)
set[s] = 0;
while (s = onechar(p,TRUE)) {
ok = TRUE;
if (**p == CHR_RANGE) {
(*p)++;
if (!(e = onechar(p,TRUE))) {
report(ILLEGAL_ERR);
freemem(set,32);
return NULL;
}
for ( ; s <= e; s++)
set[s/8] |= 1 << (s % 8);
} else
set[s/8] |= 1 << (s % 8);
}
if (!ok || **p != CHR_CLOSESET) {
report(ILLEGAL_ERR);
freemem(set,32);
return NULL;
}
if (not) {
for(s = 0; s < 32; s++)
set[s] = ~set[s];
}
return set;
}
void
freesregexp(sreg)
struct SregExp *sreg;
/* this routine frees up a sreg. It correctly handles freeing the
subpattern elements in a SRP_SUM or SRP_OR sreg and frees the
match table or string from a SRP_SETCHAR or SRP_STRING sreg.
This is one of the externally visible routines. */
{
int add = 0;
if (sreg->sre_Type == SRP_SUM || sreg->sre_Type == SRP_OR) {
int i;
add = sreg->sre_Data.number * sizeof(struct SregExp *);
for (i = 0; i < sreg->sre_Data.number; i++)
freesregexp(sreg->sre_List[i]);
} else if (sreg->sre_Type == SRP_STRING)
freemem(sreg->sre_Data.string,strlen(sreg->sre_Data.string)+1);
else if (sreg->sre_Type == SRP_SETCHAR)
freemem(sreg->sre_Data.setchar,32);
freemem(sreg,sizeof(struct SregExp) + add);
}
int
matchsregexp(p,sreg,up)
char *p;
struct SregExp *sreg;
int up;
/* This is the externally visible stub to the pattern matching part
of sreg. 'p' is the string to match to, sreg is the preparsed
sreg, and up indicates if the match is case sensitive. */
{
return matchnsregexp(p,sreg,up,strlen(p));
}
int
matchnsregexp(p,sreg,up,len)
char *p;
struct SregExp *sreg;
int up,len;
/* This routine will match a sreg to a string of length 'len'. */
/* The length, rather than NULL terminated is used to make a
lot of things a lot simpler */
{
int res,i;
if (sreg->sre_Flag & SRF_REPEAT) { /* deal with repeaters. */
char not,*h;
if (len == 0 || sreg->sre_Type == SRP_ANYCHAR) { /* always true */
res = TRUE;
goto end;
}
/* check if the lengths match up */
if (len < sreg->sre_MinLen || (sreg->sre_Flag & SRF_FIXLEN &&
(sreg->sre_MinLen == 0 || len % sreg->sre_MinLen != 0))) {
res = FALSE;
goto end;
}
not = sreg->sre_Flag & SRF_NOT;
sreg->sre_Flag &= ~(SRF_REPEAT | SRF_NOT);
if (sreg->sre_Flag & SRF_FIXLEN) { /* handle fixed lenght matches */
h = p;
while (h-p < len) {
if (!(res = matchnsregexp(h,sreg,up,sreg->sre_MinLen)))
goto end;
h += sreg->sre_MinLen;
}
} else { /* and variable length ones */
for (i = len; i >= (sreg->sre_MinLen ? sreg->sre_MinLen : 1); i--) {
if (res = matchnsregexp(p,sreg,up,i)) {
sreg->sre_Flag |= SRF_REPEAT;
if (res = matchnsregexp(p+i,sreg,up,len-i))
break;
sreg->sre_Flag &= ~SRF_REPEAT;
}
}
}
sreg->sre_Flag |= SRF_REPEAT | not;
goto end;
}
/* check the lengths first for quick rejection */
if (len < sreg->sre_MinLen || (sreg->sre_Flag & SRF_FIXLEN && len != sreg->sre_MinLen)) {
res = FALSE;
goto end;
}
switch (sreg->sre_Type) {
case (SRP_SUM) :
res = matchsum(&(sreg->sre_List[0]),sreg->sre_Data.number,p,len,up);
break;
case (SRP_ONECHAR) :
res = len == 1 && (up ? *p == sreg->sre_Data.onechar :
toupper(*p) == toupper(sreg->sre_Data.onechar));
break;
case (SRP_SETCHAR) :
res = len == 1 && matchset(sreg,*p) || (up ? FALSE :
(isupper(*p) ? matchset(sreg,tolower(*p)) :
matchset(sreg,toupper(*p))));
break;
case (SRP_ANYCHAR) :
res = len == 1;
break;
case (SRP_STRING) :
if (up)
res = !strncmp(p,sreg->sre_Data.string,len);
else
res = !strnicmp(p,sreg->sre_Data.string,len);
break;
case (SRP_NULL) :
res = len == 0;
break;
case (SRP_OR) :
for (i = 0; i < sreg->sre_Data.number; i++)
if (res = matchnsregexp(p,sreg->sre_List[i],up,len)) {
break;
}
break;
}
end:
if (sreg->sre_Flag & SRF_NOT)
return !res;
else
return res;
}
static int
matchsum(list,num,p,len,up)
struct SregExp *list[];
int num,len,up;
char *p;
/* This routine is called by match to match the elements of a sum
of sregs. It tries to be as effecient as possible, and keep
recursion to a minimum. It will match any fixed length peices
at the begining and end first, then match any fixed length
peices in the middle to break the string up. Only then does
it do the tiresome matches to the non-fixed length peices. */
{
int i,res,o;
while (num && isfixed(list[0])) {
if (!(res = matchnsregexp(p,list[0],up,list[0]->sre_MinLen)))
return FALSE;
p += list[0]->sre_MinLen;
len -= list[0]->sre_MinLen;
list++;
num--;
}
while (num && isfixed(list[num-1])) {
if (!(res = matchnsregexp(p+len-list[num-1]->sre_MinLen,list[num-1],up,
list[num-1]->sre_MinLen)))
return FALSE;
len -= list[num-1]->sre_MinLen;
num--;
}
if (num == 0)
return TRUE;
o = 0;
for (i = 1; i < num-1; i++) {
if (isfixed(list[i])) {
for ( ; len-o >= list[i]->sre_MinLen; o++) {
if (res = matchnsregexp(p+o,list[i],up,list[i]->sre_MinLen)) {
if (!(res = matchsum(list,i,p,o,up)))
return FALSE;
if (!(res = matchsum(list+(i+1),num-i-1,
p+o+list[i]->sre_MinLen,len-o-list[i]->sre_MinLen,up)))
return FALSE;
return TRUE;
}
}
return FALSE;
} else
o += realen(list[i]);
}
if (num == 1)
return matchnsregexp(p,list[0],up,len);
for (o = len; o >= list[0]->sre_MinLen; o--)
if (matchnsregexp(p,list[0],up,o) && matchsum(list+1,num-1,p+o,len-o,up))
return TRUE;
return FALSE;
}
int
iswild(p)
char *p;
/* this is a very simple routine, but it is externally visible. Returns
true if the string has wildcard characters (unescaped) false if
not */
{
while (onechar(&p,FALSE)) ;
return *p != 0;
}
void
report(i)
int i;
/* This routine now does set the IoErr value to more information.
For now it does nothing. */
{
struct Process *myself;
if (myself = (struct Process *)FindTask(0))
myself->pr_Result2 = i;
}